{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\zake7\\Anaconda3\\lib\\site-packages\\gensim\\utils.py:1197: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
      "  warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n",
      "Using TensorFlow backend.\n",
      "C:\\Users\\zake7\\Anaconda3\\lib\\site-packages\\fuzzywuzzy\\fuzz.py:35: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning\n",
      "  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import csv\n",
    "import codecs\n",
    "import gensim\n",
    "import itertools\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import operator\n",
    "import sys\n",
    "\n",
    "from nltk import ngrams\n",
    "from collections import Counter\n",
    "from string import punctuation\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "\n",
    "from iwillwin.trainer.supervised_trainer import KerasModelTrainer\n",
    "from iwillwin.data_utils.data_helpers import DataTransformer, DataLoader\n",
    "from iwillwin.config import dataset_config\n",
    "from iwillwin.data_utils.feature_engineering import FeatureCreator, CharFeatureCreator\n",
    "from simhash import Simhash\n",
    "\n",
    "from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import log_loss, make_scorer\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "from fuzzywuzzy import fuzz\n",
    "from nltk.corpus import stopwords\n",
    "from tqdm import tqdm\n",
    "from scipy.stats import skew, kurtosis\n",
    "from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis\n",
    "from nltk import word_tokenize\n",
    "\n",
    "import seaborn as sns\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Phase 1 Feature Engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Clean the texts and drop the duplicate pairs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Load the raw train/test tables through the project's DataLoader.\n",
    "data_loader = DataLoader()\n",
    "spn_train_df = data_loader.load_dataset(dataset_config.DATASET_TRAIN_PATH, names=None)\n",
    "test_df = data_loader.load_dataset(dataset_config.DATASET_TEST_PATH, names=None)\n",
    "\n",
    "# Only the training frame is de-duplicated; spn_train_df keeps every loaded row.\n",
    "train_df = spn_train_df.drop_duplicates()\n",
    "\n",
    "def preprocessing(text, clean_wiki_tokens=True, drop_image=True):\n",
    "    \"\"\"Normalise one title string through a fixed chain of regex substitutions.\n",
    "\n",
    "    A Python ``float`` input (presumably the NaN of a missing cell) is mapped to\n",
    "    the placeholder string 'error'. ``clean_wiki_tokens`` and ``drop_image`` are\n",
    "    accepted but never read.\n",
    "    \"\"\"\n",
    "    if type(text) == float:\n",
    "        return 'error'\n",
    "\n",
    "    # (pattern, replacement) pairs, applied strictly in this order: each rule\n",
    "    # sees the output of the rules before it.\n",
    "    substitutions = (\n",
    "        (r\"\\<i\\>\", \" \"),\n",
    "        # NOTE(review): an unescaped pipe is an empty alternation, so this rule\n",
    "        # matches the empty string at every position and surrounds every\n",
    "        # character with spaces. The space-separated character columns built\n",
    "        # later appear to rely on it -- confirm before changing it.\n",
    "        (r\"|\", \" \"),\n",
    "        (r\";\", \" \"),\n",
    "        (r\"’\", \"'\"),\n",
    "        (r\"‘\", \"'\"),\n",
    "        (r\"!\", \" ! \"),\n",
    "        (r\"¿\", \" ¿ \"),\n",
    "        (r\",\", \" \"),\n",
    "        (r\"–\", \" \"),\n",
    "        (r\"−\", \" \"),\n",
    "        (r\"\\.\", \" \"),\n",
    "        # Second pass over '!': it pads the already padded mark once more.\n",
    "        (r\"!\", \" ! \"),\n",
    "        (r\"\\/\", \" \"),\n",
    "        (r\"_\", \" \"),\n",
    "        (r\"\\?\", \" ? \"),\n",
    "        (r\"？\", \" ? \"),\n",
    "        (r\"\\^\", \" ^ \"),\n",
    "        (r\"\\+\", \" + \"),\n",
    "        (r\"\\-\", \" - \"),\n",
    "        (r\"\\=\", \" = \"),\n",
    "        (r\"#\", \" # \"),\n",
    "        # Apostrophes (including the curly quotes normalised above) are dropped last.\n",
    "        (r\"'\", \" \"),\n",
    "    )\n",
    "    for pattern, replacement in substitutions:\n",
    "        text = re.sub(pattern, replacement, text)\n",
    "    return text\n",
    "\n",
    "dfs = [train_df, test_df]\n",
    "\n",
    "for df in dfs:\n",
    "    # Working copies of the two Chinese title columns.\n",
    "    df['spn_1'] = df['title1_zh']\n",
    "    df['spn_2'] = df['title2_zh']\n",
    "\n",
    "    # Keep the untouched text next to the cleaned version.\n",
    "    df['raw_spn_1'] = df['spn_1'].values\n",
    "    df['raw_spn_2'] = df['spn_2'].values\n",
    "\n",
    "    # Overwrite the working copies with their cleaned form.\n",
    "    df['spn_1'] = df['spn_1'].apply(preprocessing)\n",
    "    df['spn_2'] = df['spn_2'].apply(preprocessing)\n",
    "\n",
    "# Persist both frames; the next stage reads them back from these paths.\n",
    "train_df.to_csv(dataset_config.PROCESSED_CHARS_TRAIN_SET, index=False, encoding='utf-8')\n",
    "test_df.to_csv(dataset_config.PROCESSED_CHARS_TEST_SET, index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Notebook-level constants. They are not referenced by the neighbouring\n",
    "# feature-engineering cells; judging by the names they look like settings for a\n",
    "# downstream model (vocabulary cap, embedding width, padded sequence length,\n",
    "# number of output classes) -- TODO confirm against the code that consumes them.\n",
    "NB_WORDS = 10000\n",
    "EMBEDDING_DIM = 300\n",
    "MAX_SEQUENCE_LENGTH = 50\n",
    "OUT_SIZE = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 'RAW' is the only mode handled below; any other value leaves\n",
    "# train_output_path / test_output_path undefined.\n",
    "processed_on = 'RAW'\n",
    "\n",
    "# Read back the cleaned tables from the PROCESSED_CHARS_* paths.\n",
    "train_path = dataset_config.PROCESSED_CHARS_TRAIN_SET\n",
    "test_path = dataset_config.PROCESSED_CHARS_TEST_SET\n",
    "train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)\n",
    "\n",
    "data_loader = DataLoader()\n",
    "dfs = [train_df, test_df]\n",
    "\n",
    "if processed_on == 'RAW':\n",
    "    train_output_path, test_output_path = (\n",
    "        dataset_config.ENGINEERED_CHARS_TRAIN_SET,\n",
    "        dataset_config.ENGINEERED_CHARS_TEST_SET,\n",
    "    )\n",
    "    \n",
    "def split(v):\n",
    "    \"\"\"Return the whitespace-separated tokens of ``str(v)`` as a list.\"\"\"\n",
    "    return str(v).split()\n",
    "\n",
    "# Character tokens: every character of the cleaned title except the spaces.\n",
    "for df in [train_df, test_df]:\n",
    "    for source_column in ('spn_1', 'spn_2'):\n",
    "        df['splited_' + source_column] = df[source_column].apply(\n",
    "            lambda title: [ch for ch in title if ch != ' ']\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tid1</th>\n",
       "      <th>tid2</th>\n",
       "      <th>title1_zh</th>\n",
       "      <th>title2_zh</th>\n",
       "      <th>title1_en</th>\n",
       "      <th>title2_en</th>\n",
       "      <th>label</th>\n",
       "      <th>spn_1</th>\n",
       "      <th>spn_2</th>\n",
       "      <th>raw_spn_1</th>\n",
       "      <th>raw_spn_2</th>\n",
       "      <th>splited_spn_1</th>\n",
       "      <th>splited_spn_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2017养老保险又新增两项，农村老人人人可申领，你领到了吗</td>\n",
       "      <td>警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京</td>\n",
       "      <td>There are two new old-age insurance benefits f...</td>\n",
       "      <td>Police disprove \"bird's nest congress each per...</td>\n",
       "      <td>unrelated</td>\n",
       "      <td>2 0 1 7 养 老 保 险 又 新 增 两 项 ， 农 村 老 人 人 人 可 申...</td>\n",
       "      <td>警 方 辟 谣 “ 鸟 巢 大 会 每 人 领 5 万 ”   仍 有 老 人 坚 持 进 京</td>\n",
       "      <td>2017养老保险又新增两项，农村老人人人可申领，你领到了吗</td>\n",
       "      <td>警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京</td>\n",
       "      <td>[2, 0, 1, 7, 养, 老, 保, 险, 又, 新, 增, 两, 项, ，, 农, ...</td>\n",
       "      <td>[警, 方, 辟, 谣, “, 鸟, 巢, 大, 会, 每, 人, 领, 5, 万, ”, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>\"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港</td>\n",
       "      <td>深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小</td>\n",
       "      <td>\"If you do not come to Shenzhen, sooner or lat...</td>\n",
       "      <td>Shenzhen's GDP outstrips Hong Kong? Shenzhen S...</td>\n",
       "      <td>unrelated</td>\n",
       "      <td>\" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 \" ， 不 出 1 0 年 深...</td>\n",
       "      <td>深 圳 G D P 首 超 香 港  ?  深 圳 统 计 局 辟 谣 ： 只 是 差 距...</td>\n",
       "      <td>\"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港</td>\n",
       "      <td>深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小</td>\n",
       "      <td>[\", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...</td>\n",
       "      <td>[深, 圳, G, D, P, 首, 超, 香, 港, ?, 深, 圳, 统, 计, 局, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>\"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港</td>\n",
       "      <td>GDP首超香港？深圳澄清：还差一点点……</td>\n",
       "      <td>\"If you do not come to Shenzhen, sooner or lat...</td>\n",
       "      <td>The GDP overtopped Hong Kong? Shenzhen clarifi...</td>\n",
       "      <td>unrelated</td>\n",
       "      <td>\" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 \" ， 不 出 1 0 年 深...</td>\n",
       "      <td>G D P 首 超 香 港  ?  深 圳 澄 清 ： 还 差 一 点 点 … …</td>\n",
       "      <td>\"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港</td>\n",
       "      <td>GDP首超香港？深圳澄清：还差一点点……</td>\n",
       "      <td>[\", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...</td>\n",
       "      <td>[G, D, P, 首, 超, 香, 港, ?, 深, 圳, 澄, 清, ：, 还, 差, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>\"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港</td>\n",
       "      <td>去年深圳GDP首超香港？深圳统计局辟谣：还差611亿</td>\n",
       "      <td>\"If you do not come to Shenzhen, sooner or lat...</td>\n",
       "      <td>Shenzhen's GDP topped Hong Kong last year? She...</td>\n",
       "      <td>unrelated</td>\n",
       "      <td>\" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 \" ， 不 出 1 0 年 深...</td>\n",
       "      <td>去 年 深 圳 G D P 首 超 香 港  ?  深 圳 统 计 局 辟 谣 ： 还 差...</td>\n",
       "      <td>\"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港</td>\n",
       "      <td>去年深圳GDP首超香港？深圳统计局辟谣：还差611亿</td>\n",
       "      <td>[\", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...</td>\n",
       "      <td>[去, 年, 深, 圳, G, D, P, 首, 超, 香, 港, ?, 深, 圳, 统, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>9</td>\n",
       "      <td>6</td>\n",
       "      <td>7</td>\n",
       "      <td>\"用大蒜鉴别地沟油的方法,怎么鉴别地沟油</td>\n",
       "      <td>吃了30年食用油才知道，一片大蒜轻松鉴别地沟油</td>\n",
       "      <td>\"How to discriminate oil from gutter oil by me...</td>\n",
       "      <td>It took 30 years of cooking oil to know that o...</td>\n",
       "      <td>agreed</td>\n",
       "      <td>\" 用 大 蒜 鉴 别 地 沟 油 的 方 法   怎 么 鉴 别 地 沟 油</td>\n",
       "      <td>吃 了 3 0 年 食 用 油 才 知 道 ， 一 片 大 蒜 轻 松 鉴 别 地 沟 油</td>\n",
       "      <td>\"用大蒜鉴别地沟油的方法,怎么鉴别地沟油</td>\n",
       "      <td>吃了30年食用油才知道，一片大蒜轻松鉴别地沟油</td>\n",
       "      <td>[\", 用, 大, 蒜, 鉴, 别, 地, 沟, 油, 的, 方, 法, 怎, 么, 鉴, ...</td>\n",
       "      <td>[吃, 了, 3, 0, 年, 食, 用, 油, 才, 知, 道, ，, 一, 片, 大, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  tid1  tid2                          title1_zh  \\\n",
       "0   0     0     1      2017养老保险又新增两项，农村老人人人可申领，你领到了吗   \n",
       "1   3     2     3  \"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港   \n",
       "2   1     2     4  \"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港   \n",
       "3   2     2     5  \"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港   \n",
       "4   9     6     7               \"用大蒜鉴别地沟油的方法,怎么鉴别地沟油   \n",
       "\n",
       "                    title2_zh  \\\n",
       "0    警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京   \n",
       "1   深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小   \n",
       "2        GDP首超香港？深圳澄清：还差一点点……   \n",
       "3  去年深圳GDP首超香港？深圳统计局辟谣：还差611亿   \n",
       "4     吃了30年食用油才知道，一片大蒜轻松鉴别地沟油   \n",
       "\n",
       "                                           title1_en  \\\n",
       "0  There are two new old-age insurance benefits f...   \n",
       "1  \"If you do not come to Shenzhen, sooner or lat...   \n",
       "2  \"If you do not come to Shenzhen, sooner or lat...   \n",
       "3  \"If you do not come to Shenzhen, sooner or lat...   \n",
       "4  \"How to discriminate oil from gutter oil by me...   \n",
       "\n",
       "                                           title2_en      label  \\\n",
       "0  Police disprove \"bird's nest congress each per...  unrelated   \n",
       "1  Shenzhen's GDP outstrips Hong Kong? Shenzhen S...  unrelated   \n",
       "2  The GDP overtopped Hong Kong? Shenzhen clarifi...  unrelated   \n",
       "3  Shenzhen's GDP topped Hong Kong last year? She...  unrelated   \n",
       "4  It took 30 years of cooking oil to know that o...     agreed   \n",
       "\n",
       "                                               spn_1  \\\n",
       "0     2 0 1 7 养 老 保 险 又 新 增 两 项 ， 农 村 老 人 人 人 可 申...   \n",
       "1   \" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 \" ， 不 出 1 0 年 深...   \n",
       "2   \" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 \" ， 不 出 1 0 年 深...   \n",
       "3   \" 你 不 来 深 圳 ， 早 晚 你 儿 子 也 要 来 \" ， 不 出 1 0 年 深...   \n",
       "4           \" 用 大 蒜 鉴 别 地 沟 油 的 方 法   怎 么 鉴 别 地 沟 油    \n",
       "\n",
       "                                               spn_2  \\\n",
       "0   警 方 辟 谣 “ 鸟 巢 大 会 每 人 领 5 万 ”   仍 有 老 人 坚 持 进 京    \n",
       "1   深 圳 G D P 首 超 香 港  ?  深 圳 统 计 局 辟 谣 ： 只 是 差 距...   \n",
       "2         G D P 首 超 香 港  ?  深 圳 澄 清 ： 还 差 一 点 点 … …    \n",
       "3   去 年 深 圳 G D P 首 超 香 港  ?  深 圳 统 计 局 辟 谣 ： 还 差...   \n",
       "4     吃 了 3 0 年 食 用 油 才 知 道 ， 一 片 大 蒜 轻 松 鉴 别 地 沟 油    \n",
       "\n",
       "                           raw_spn_1                   raw_spn_2  \\\n",
       "0      2017养老保险又新增两项，农村老人人人可申领，你领到了吗    警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京   \n",
       "1  \"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港   深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小   \n",
       "2  \"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港        GDP首超香港？深圳澄清：还差一点点……   \n",
       "3  \"你不来深圳，早晚你儿子也要来\"，不出10年深圳人均GDP将超香港  去年深圳GDP首超香港？深圳统计局辟谣：还差611亿   \n",
       "4               \"用大蒜鉴别地沟油的方法,怎么鉴别地沟油     吃了30年食用油才知道，一片大蒜轻松鉴别地沟油   \n",
       "\n",
       "                                       splited_spn_1  \\\n",
       "0  [2, 0, 1, 7, 养, 老, 保, 险, 又, 新, 增, 两, 项, ，, 农, ...   \n",
       "1  [\", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...   \n",
       "2  [\", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...   \n",
       "3  [\", 你, 不, 来, 深, 圳, ，, 早, 晚, 你, 儿, 子, 也, 要, 来, ...   \n",
       "4  [\", 用, 大, 蒜, 鉴, 别, 地, 沟, 油, 的, 方, 法, 怎, 么, 鉴, ...   \n",
       "\n",
       "                                       splited_spn_2  \n",
       "0  [警, 方, 辟, 谣, “, 鸟, 巢, 大, 会, 每, 人, 领, 5, 万, ”, ...  \n",
       "1  [深, 圳, G, D, P, 首, 超, 香, 港, ?, 深, 圳, 统, 计, 局, ...  \n",
       "2  [G, D, P, 首, 超, 香, 港, ?, 深, 圳, 澄, 清, ：, 还, 差, ...  \n",
       "3  [去, 年, 深, 圳, G, D, P, 首, 超, 香, 港, ?, 深, 圳, 统, ...  \n",
       "4  [吃, 了, 3, 0, 年, 食, 用, 油, 才, 知, 道, ，, 一, 片, 大, ...  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>tid1</th>\n",
       "      <th>tid2</th>\n",
       "      <th>title1_zh</th>\n",
       "      <th>title2_zh</th>\n",
       "      <th>title1_en</th>\n",
       "      <th>title2_en</th>\n",
       "      <th>spn_1</th>\n",
       "      <th>spn_2</th>\n",
       "      <th>raw_spn_1</th>\n",
       "      <th>raw_spn_2</th>\n",
       "      <th>splited_spn_1</th>\n",
       "      <th>splited_spn_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>321187</td>\n",
       "      <td>167562</td>\n",
       "      <td>59521</td>\n",
       "      <td>萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大</td>\n",
       "      <td>辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？</td>\n",
       "      <td>egypt 's presidential election failed to win m...</td>\n",
       "      <td>Lyon! Lyon officials have denied that Felipe F...</td>\n",
       "      <td>萨 拉 赫 人 气 爆 棚   !   埃 及 总 统 大 选 未 参 选 获 百 万 选...</td>\n",
       "      <td>辟 谣 ！ 里 昂 官 方 否 认 费 基 尔 加 盟 利 物 浦 ， 难 道 是 价 格...</td>\n",
       "      <td>萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大</td>\n",
       "      <td>辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？</td>\n",
       "      <td>[萨, 拉, 赫, 人, 气, 爆, 棚, !, 埃, 及, 总, 统, 大, 选, 未, ...</td>\n",
       "      <td>[辟, 谣, ！, 里, 昂, 官, 方, 否, 认, 费, 基, 尔, 加, 盟, 利, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>321190</td>\n",
       "      <td>167564</td>\n",
       "      <td>91315</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国</td>\n",
       "      <td>A message from Saddam Hussein after he was cap...</td>\n",
       "      <td>The Top 10 Americans believe that the Lizard M...</td>\n",
       "      <td>萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思</td>\n",
       "      <td>1 0 大 最 让 美 国 人 相 信 的 荒 诞 谣 言 ， 如 蜥 蜴 人 掌 控 着...</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国</td>\n",
       "      <td>[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...</td>\n",
       "      <td>[1, 0, 大, 最, 让, 美, 国, 人, 相, 信, 的, 荒, 诞, 谣, 言, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>321189</td>\n",
       "      <td>167563</td>\n",
       "      <td>167564</td>\n",
       "      <td>萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>Will the United States wage war on Iraq withou...</td>\n",
       "      <td>A message from Saddam Hussein after he was cap...</td>\n",
       "      <td>萨 达 姆 此 项 计 划 没 有 此 国 破 坏 的 话 ， 美 国 还 会 对 伊 拉...</td>\n",
       "      <td>萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思</td>\n",
       "      <td>萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>[萨, 达, 姆, 此, 项, 计, 划, 没, 有, 此, 国, 破, 坏, 的, 话, ...</td>\n",
       "      <td>[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>321193</td>\n",
       "      <td>167564</td>\n",
       "      <td>160994</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！</td>\n",
       "      <td>A message from Saddam Hussein after he was cap...</td>\n",
       "      <td>The hanging Saddam is a surrogate? This man's ...</td>\n",
       "      <td>萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思</td>\n",
       "      <td>被 绞 刑 处 死 的 萨 达 姆 是 替 身  ?  他 的 此 男 人 举 动 击 破...</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！</td>\n",
       "      <td>[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...</td>\n",
       "      <td>[被, 绞, 刑, 处, 死, 的, 萨, 达, 姆, 是, 替, 身, ?, 他, 的, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>321191</td>\n",
       "      <td>167564</td>\n",
       "      <td>15084</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>中国川贝枇杷膏在美国受到热捧？纯属谣言！</td>\n",
       "      <td>A message from Saddam Hussein after he was cap...</td>\n",
       "      <td>Chinese loquat loquat plaster in America? Pure...</td>\n",
       "      <td>萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思</td>\n",
       "      <td>中 国 川 贝 枇 杷 膏 在 美 国 受 到 热 捧  ?  纯 属 谣 言 ！</td>\n",
       "      <td>萨达姆被捕后告诫美国的一句话，发人深思</td>\n",
       "      <td>中国川贝枇杷膏在美国受到热捧？纯属谣言！</td>\n",
       "      <td>[萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...</td>\n",
       "      <td>[中, 国, 川, 贝, 枇, 杷, 膏, 在, 美, 国, 受, 到, 热, 捧, ?, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       id    tid1    tid2                        title1_zh  \\\n",
       "0  321187  167562   59521  萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大   \n",
       "1  321190  167564   91315              萨达姆被捕后告诫美国的一句话，发人深思   \n",
       "2  321189  167563  167564    萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗   \n",
       "3  321193  167564  160994              萨达姆被捕后告诫美国的一句话，发人深思   \n",
       "4  321191  167564   15084              萨达姆被捕后告诫美国的一句话，发人深思   \n",
       "\n",
       "                     title2_zh  \\\n",
       "0  辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？   \n",
       "1    10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国   \n",
       "2          萨达姆被捕后告诫美国的一句话，发人深思   \n",
       "3  被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！   \n",
       "4         中国川贝枇杷膏在美国受到热捧？纯属谣言！   \n",
       "\n",
       "                                           title1_en  \\\n",
       "0  egypt 's presidential election failed to win m...   \n",
       "1  A message from Saddam Hussein after he was cap...   \n",
       "2  Will the United States wage war on Iraq withou...   \n",
       "3  A message from Saddam Hussein after he was cap...   \n",
       "4  A message from Saddam Hussein after he was cap...   \n",
       "\n",
       "                                           title2_en  \\\n",
       "0  Lyon! Lyon officials have denied that Felipe F...   \n",
       "1  The Top 10 Americans believe that the Lizard M...   \n",
       "2  A message from Saddam Hussein after he was cap...   \n",
       "3  The hanging Saddam is a surrogate? This man's ...   \n",
       "4  Chinese loquat loquat plaster in America? Pure...   \n",
       "\n",
       "                                               spn_1  \\\n",
       "0   萨 拉 赫 人 气 爆 棚   !   埃 及 总 统 大 选 未 参 选 获 百 万 选...   \n",
       "1             萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思    \n",
       "2   萨 达 姆 此 项 计 划 没 有 此 国 破 坏 的 话 ， 美 国 还 会 对 伊 拉...   \n",
       "3             萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思    \n",
       "4             萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思    \n",
       "\n",
       "                                               spn_2  \\\n",
       "0   辟 谣 ！ 里 昂 官 方 否 认 费 基 尔 加 盟 利 物 浦 ， 难 道 是 价 格...   \n",
       "1   1 0 大 最 让 美 国 人 相 信 的 荒 诞 谣 言 ， 如 蜥 蜴 人 掌 控 着...   \n",
       "2             萨 达 姆 被 捕 后 告 诫 美 国 的 一 句 话 ， 发 人 深 思    \n",
       "3   被 绞 刑 处 死 的 萨 达 姆 是 替 身  ?  他 的 此 男 人 举 动 击 破...   \n",
       "4         中 国 川 贝 枇 杷 膏 在 美 国 受 到 热 捧  ?  纯 属 谣 言 ！    \n",
       "\n",
       "                         raw_spn_1                    raw_spn_2  \\\n",
       "0  萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大  辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？   \n",
       "1              萨达姆被捕后告诫美国的一句话，发人深思    10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国   \n",
       "2    萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗          萨达姆被捕后告诫美国的一句话，发人深思   \n",
       "3              萨达姆被捕后告诫美国的一句话，发人深思  被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！   \n",
       "4              萨达姆被捕后告诫美国的一句话，发人深思         中国川贝枇杷膏在美国受到热捧？纯属谣言！   \n",
       "\n",
       "                                       splited_spn_1  \\\n",
       "0  [萨, 拉, 赫, 人, 气, 爆, 棚, !, 埃, 及, 总, 统, 大, 选, 未, ...   \n",
       "1  [萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...   \n",
       "2  [萨, 达, 姆, 此, 项, 计, 划, 没, 有, 此, 国, 破, 坏, 的, 话, ...   \n",
       "3  [萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...   \n",
       "4  [萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...   \n",
       "\n",
       "                                       splited_spn_2  \n",
       "0  [辟, 谣, ！, 里, 昂, 官, 方, 否, 认, 费, 基, 尔, 加, 盟, 利, ...  \n",
       "1  [1, 0, 大, 最, 让, 美, 国, 人, 相, 信, 的, 荒, 诞, 谣, 言, ...  \n",
       "2  [萨, 达, 姆, 被, 捕, 后, 告, 诫, 美, 国, 的, 一, 句, 话, ，, ...  \n",
       "3  [被, 绞, 刑, 处, 死, 的, 萨, 达, 姆, 是, 替, 身, ?, 他, 的, ...  \n",
       "4  [中, 国, 川, 贝, 枇, 杷, 膏, 在, 美, 国, 受, 到, 热, 捧, ?, ...  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No of words in the dictionary = 5244\n",
      "[FE] create the frequency features\n",
      "[FE] creating the IR features\n",
      "[FE] creating the weighted distance features\n",
      "[FE] creating the length features\n",
      "[FE] creating the weight features\n",
      "[FE] creating the distance features\n",
      "[FE] cosine_sim sample= \n",
      " [0.31234752377721214, 0.37921028494152054]\n",
      "[FE] manhattan_dis sample = \n",
      " [44.0, 38.0]\n",
      "[FE] eucledian_dis sample = \n",
      " [6.782329983125268, 6.928203230275509]\n",
      "[FE] jaccard_dis sample = \n",
      " [0, 0]\n",
      "[FE] minkowsk_dis sample = \n",
      " [6.782329983125268, 6.928203230275509]\n",
      "[FE] creating the fuzzy features\n",
      "[FE] creating the topic word features\n",
      "[FE] TODO! Create the graph features\n",
      "[FE] create the frequency features\n",
      "[FE] creating the IR features\n",
      "[FE] creating the weighted distance features\n",
      "[FE] creating the length features\n",
      "[FE] creating the weight features\n",
      "[FE] creating the distance features\n",
      "[FE] cosine_sim sample= \n",
      " [0.0, 0.32963425737213164]\n",
      "[FE] manhattan_dis sample = \n",
      " [57.0, 34.0]\n",
      "[FE] eucledian_dis sample = \n",
      " [8.306623862918075, 5.830951894845301]\n",
      "[FE] jaccard_dis sample = \n",
      " [0, 0]\n",
      "[FE] minkowsk_dis sample = \n",
      " [8.306623862918075, 5.830951894845301]\n",
      "[FE] creating the fuzzy features\n",
      "[FE] creating the topic word features\n",
      "[FE] TODO! Create the graph features\n",
      "[FE] Feature engineered. With features ['id' 'tid1' 'tid2' 'title1_zh' 'title2_zh' 'title1_en' 'title2_en'\n",
      " 'spn_1' 'spn_2' 'raw_spn_1' 'raw_spn_2' 'splited_spn_1' 'splited_spn_2'\n",
      " 'bm25_q1_to_q2' 'bm25_q2_to_q1' 'weighted_cosine_sim' 'len_word_max'\n",
      " 'len_word_min' 'len_char_max' 'len_char_min' 'len_word_q1' 'len_word_q2'\n",
      " 'len_char_q1' 'len_char_q2' 'word_length_diff' 'char_length_diff'\n",
      " 'len_avg_word_1' 'len_avg_word_2' 'avg_word_diff'\n",
      " 'len_diff_remove_stopwords' 'word_shares' 'word_match' 'tfidf_word_match'\n",
      " 'diff_tfidf_word_match' 'shared_count' 'bigram_corr' 'trigram_corr'\n",
      " 'word_match_no_stopwords' 'unique_word_ratio' 'cosine_sim'\n",
      " 'manhattan_dis' 'eucledian_dis' 'jaccard_dis' 'minkowsk_dis'\n",
      " 'fuzzy_ratio' 'fuzzy_set_ratio' 'fuzzy_partial_ratio'\n",
      " 'fuzzy_token_sort_ratio' 'fuzzy_qratio' 'fuzzy_WRatio'\n",
      " 'longest_substr_ratio' 'q1_cómo' 'q2_cómo' 'cómo_both' 'q1_qué' 'q2_qué'\n",
      " 'qué_both']\n",
      "Wall time: 3h 54min 5s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Run the project's CharFeatureCreator over both frames and rebind train_df /\n",
    "# test_df to the frames it returns. The recorded output lists the added columns\n",
    "# (BM25, length, tf-idf match, distance, fuzzy, ...) and a wall time of\n",
    "# 3h 54min, so expect this cell to be slow.\n",
    "feature_creator = CharFeatureCreator(train_df, test_df, data_loader, normalization=False)\n",
    "train_df, test_df = feature_creator.create_features()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Features Zoo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SimHash"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\zake7\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: DeprecationWarning: generator 'ngrams' raised StopIteration\n",
      "  after removing the cwd from sys.path.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Build sim_hash\n",
      "trainset has processed.\n",
      "Build sim_hash\n",
      "testset has processed.\n",
      "Wall time: 1h 43min 56s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "def create_hash_features(df):\n",
    "    \"\"\"Add SimHash-distance features for every title pair to ``df`` in place.\n",
    "\n",
    "    Reads the token lists in ``splited_spn_1`` / ``splited_spn_2`` and writes:\n",
    "\n",
    "    * ``sim_hash`` -- the five distances joined by ':' (kept for compatibility);\n",
    "    * ``simhash_distance`` -- distance between the raw token lists;\n",
    "    * ``simhash_distance_2gram`` / ``simhash_distance_3gram`` -- distance between\n",
    "      the token 2-grams / 3-grams;\n",
    "    * ``simhash_distance_ch_2gram`` / ``simhash_distance_ch_3gram`` -- distance\n",
    "      between character 2-grams / 3-grams of the space-joined tokens.\n",
    "\n",
    "    Returns None; ``df`` is mutated.\n",
    "    \"\"\"\n",
    "    distance_columns = (\n",
    "        'simhash_distance',\n",
    "        'simhash_distance_2gram',\n",
    "        'simhash_distance_3gram',\n",
    "        'simhash_distance_ch_2gram',\n",
    "        'simhash_distance_ch_3gram',\n",
    "    )\n",
    "\n",
    "    def get_word_ngrams(sequence, n=3):\n",
    "        # Space-joined sliding windows of n tokens. zip() simply yields nothing\n",
    "        # for inputs shorter than n, whereas the nltk `ngrams` generator used\n",
    "        # before leaked StopIteration there (see the recorded DeprecationWarning;\n",
    "        # that is a RuntimeError under PEP 479).\n",
    "        return [' '.join(ngram) for ngram in zip(*(sequence[i:] for i in range(n)))]\n",
    "\n",
    "    def get_character_ngrams(sequence, n=3):\n",
    "        # Character windows over the tokens joined by single spaces.\n",
    "        sequence = ' '.join(sequence)\n",
    "        return [sequence[i:i+n] for i in range(len(sequence)-n+1)]\n",
    "\n",
    "    def calculate_simhash_distance(sequence1, sequence2):\n",
    "        return Simhash(sequence1).distance(Simhash(sequence2))\n",
    "\n",
    "    def calculate_all_simhash(row):\n",
    "        # Every variant is derived from the ORIGINAL token lists. The previous\n",
    "        # version rebound q1/q2 after each step, so each later feature was\n",
    "        # computed on the preceding step's n-grams (3-grams of 2-gram strings,\n",
    "        # character n-grams of those, ...) instead of on the tokens.\n",
    "        tokens_1, tokens_2 = row['splited_spn_1'], row['splited_spn_2']\n",
    "        distances = (\n",
    "            calculate_simhash_distance(tokens_1, tokens_2),\n",
    "            calculate_simhash_distance(get_word_ngrams(tokens_1, 2),\n",
    "                                       get_word_ngrams(tokens_2, 2)),\n",
    "            calculate_simhash_distance(get_word_ngrams(tokens_1, 3),\n",
    "                                       get_word_ngrams(tokens_2, 3)),\n",
    "            calculate_simhash_distance(get_character_ngrams(tokens_1, 2),\n",
    "                                       get_character_ngrams(tokens_2, 2)),\n",
    "            calculate_simhash_distance(get_character_ngrams(tokens_1, 3),\n",
    "                                       get_character_ngrams(tokens_2, 3)),\n",
    "        )\n",
    "        return ':'.join(str(distance) for distance in distances)\n",
    "\n",
    "    df['sim_hash'] = df.apply(calculate_all_simhash, axis=1)\n",
    "    print(\"Build sim_hash\")\n",
    "    # Unpack the joined string into one float column per distance.\n",
    "    for position, column in enumerate(distance_columns):\n",
    "        df[column] = df['sim_hash'].apply(\n",
    "            lambda joined, position=position: float(joined.split(':')[position])\n",
    "        )\n",
    "\n",
    "create_hash_features(train_df)\n",
    "print(\"trainset has processed.\")\n",
    "create_hash_features(test_df)\n",
    "print(\"testset has processed.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## JellyFish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 2h 31min 13s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "import jellyfish\n",
    "import numpy as np\n",
    "def smith_waterman(a, b, alignment_score=1, gap_cost=1):\n",
    "  # H holds the alignment score at each point, computed incrementally\n",
    "    H = np.zeros((len(a) + 1, len(b) + 1))\n",
    "    for i in range(1, len(a) + 1):\n",
    "        for j in range(1, len(b) + 1):\n",
    "        # The score for substituting the letter a[i-1] for b[j-1]. Generally low\n",
    "        # for mismatch, high for match.\n",
    "            match = H[i-1,j -1] + (alignment_score if a[i-1] == b[j-1] else 0)\n",
    "            # The scores for for introducing extra letters in one of the strings (or\n",
    "            # by symmetry, deleting them from the other).\n",
    "            delete = H[1:i,j].max() - gap_cost if i > 1 else 0\n",
    "            insert = H[i,1:j].max() - gap_cost if j > 1 else 0\n",
    "            H[i, j] = max(match, delete, insert, 0)\n",
    "    # The highest score is the best local alignment.\n",
    "    # For our purposes, we don't actually care _what_ the alignment was, just how\n",
    "    # aligned the two strings were.\n",
    "    return H.max()\n",
    "\n",
    "# Each entry maps an output column to the scorer applied to every (spn_1, spn_2) pair.\n",
    "pair_scorers = (\n",
    "    ('jellyfish_jaro_winkler_distance', jellyfish.jaro_winkler),\n",
    "    ('smith_waterman_distance', smith_waterman),\n",
    ")\n",
    "for df in (train_df, test_df):\n",
    "    for feature_name, scorer in pair_scorers:\n",
    "        sentence_pairs = df[['spn_1', 'spn_2']]\n",
    "        df[feature_name] = sentence_pairs.apply(lambda pair: scorer(pair['spn_1'], pair['spn_2']), axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check feature correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Correlate only the numeric/bool columns of the last 10000 rows: the frame also holds\n",
    "# text columns (e.g. 'sim_hash'), which pandas >= 2.0 refuses to coerce inside corr().\n",
    "train_df.iloc[-10000:].select_dtypes(include=['number', 'bool']).corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Extract training columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "meta_columns = ['bm25_q1_to_q2', 'bm25_q2_to_q1', 'weighted_cosine_sim',\n",
    "       'len_word_max', 'len_word_min', 'len_char_max', 'len_char_min',\n",
    "       'word_length_diff', 'char_length_diff', 'len_diff_remove_stopwords',\n",
    "       'word_match', 'tfidf_word_match', 'shared_count', 'bigram_corr', 'trigram_corr',\n",
    "       'word_match_no_stopwords', 'unique_word_ratio', 'cosine_sim',\n",
    "       'manhattan_dis', 'eucledian_dis', 'jaccard_dis', 'minkowsk_dis',\n",
    "       'fuzzy_ratio', 'fuzzy_set_ratio', 'fuzzy_partial_ratio',\n",
    "       'fuzzy_token_sort_ratio', 'fuzzy_qratio', 'fuzzy_WRatio',\n",
    "       'longest_substr_ratio', 'cómo_both', 'simhash_distance', 'simhash_distance_2gram',\n",
    "       'simhash_distance_3gram', 'simhash_distance_ch_2gram',\n",
    "       'simhash_distance_ch_3gram', 'raw_wmd', 'word2vec_jaccard_distance',\n",
    "       'freq_based_word2vec_cosine_distance',\n",
    "       'freq_based_word2vec_jaccard_distance',\n",
    "       'lda_balanced_euclidean_distance', 'lsi_cosine_distance',\n",
    "       'lsi_jaccard_distance', 'jellyfish_jaro_winkler_distance',\n",
    "       'smith_waterman_distance'\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Output the engineered features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'../data/processed_dataset/engineered_chars_test.csv'"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the path the test-set features are written to by the export cell.\n",
    "test_output_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Persist both engineered frames as UTF-8 CSV files without the index column.\n",
    "for frame, destination in ((train_df, train_output_path), (test_df, test_output_path)):\n",
    "    frame.to_csv(destination, index=False, encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 1,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
